knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(), fig.width=16)
options(scipen=999)
library(tidyverse)
── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.0 ✔ purrr 0.3.1
✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
✔ tidyr 0.8.3 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
read_basfiles <- function(bas_filepath) {
bas_file <- read_tsv(bas_filepath,
col_types = cols_only('sample' = 'c',
'platform' = 'c',
'average_quality_of_mapped_bases' = col_guess()))
return( bas_file)
}
bas_files <- list.files(path = "./data/bas",
pattern="*bam.bas",
full.names=TRUE)
bas_dt <-
map_dfr(bas_files, read_basfiles ) %>%
rename(avg_qual="average_quality_of_mapped_bases")
bas_dt
bas_dt <-
bas_dt %>%
group_by(sample) %>%
mutate(id=1:n()) %>%
select(-platform)%>%
spread(id,avg_qual)
bas_dt
n_not_na <- function(x) {
sum(!is.na(x))
}
bas_dt <-
bas_dt %>%
ungroup()%>%
mutate( avg_qual_sd =pmap_dbl(select(., -sample), lift_vd(sd,na.rm =TRUE )),
avg_qual_mean = rowMeans(select(., -sample),na.rm =TRUE),
avg_qual_n= pmap_dbl(select(., -sample), lift_vd(n_not_na)),
avg_qual_min= pmap_dbl(select(., -sample), lift_vd(min, na.rm =TRUE)),
avg_qual_max= pmap_dbl(select(., -sample), lift_vd(max, na.rm =TRUE))) %>%
select (sample, avg_qual_n, avg_qual_min,avg_qual_max,avg_qual_mean,avg_qual_sd, everything() )
ped <-
read_tsv("./data/metadata/20130606_g1k.ped")%>%
rename(sample="Individual ID")
Parsed with column specification:
cols(
`Family ID` = col_character(),
`Individual ID` = col_character(),
`Paternal ID` = col_character(),
`Maternal ID` = col_character(),
Gender = col_double(),
Phenotype = col_double(),
Population = col_character(),
Relationship = col_character(),
Siblings = col_character(),
`Second Order` = col_character(),
`Third Order` = col_character(),
`Other Comments` = col_character()
)
ped
bas_dt <- inner_join( ped %>%
select(sample, Population),
bas_dt,
by="sample")
bas_dt
bas_filepath <- "./data/metadata/average_quality_dt.txt"
write_tsv(bas_dt, bas_filepath)
Plot
ggplot(bas_dt,
aes(x=avg_qual_min,
y=avg_qual_max,
colour=avg_qual_n )) +
geom_point(size=2)
ggplot(bas_dt,
aes(x=avg_qual_mean,
y=avg_qual_sd,
colour=avg_qual_n )) +
geom_point(size=2)
Warning: Removed 366 rows containing missing values (geom_point).